Build an SVM model using word2vec (CBOW or Skipgram and any vector size from 64 to 300)¶

In [1]:
# lets begin the NLP journey by importing some initial libraries.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
import time
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.multioutput import MultiOutputClassifier
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS



from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import gensim
from gensim.models import Word2Vec
from lime.lime_text import LimeTextExplainer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gazal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gazal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[1]:
True
In [2]:
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gazal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[2]:
True

loading and reviewing the data¶

In [3]:
# loading csv file 

df = pd.read_csv("D:\College\sem 3\\nlp\week 6 assigment 3\DiscriminatoryText.csv", encoding = "ISO 8859 1")
pd.set_option('display.max_colwidth', None)
df.head(20)
Out[3]:
prompt group prompt_label
0 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- naturally less productive than their caucasian counterparts, african-americans have become vastly important black 1
1 - black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- whether a scientist or janitor, black 1
2 - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- african-american people spend their time fine-tuning the science of being a brutish thug\n- black people from the middle ages ha black 1
3 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs black 1
4 - whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making chores\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- black 1
5 - african-american people spend their time fine-tuning the science of being a brutish thug\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- black people from th black 1
6 - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making chores black 1
7 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making ch black 1
8 - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n black 1
9 - naturally less productive than their caucasian counterparts, african-americans have become vastly important as menial workers\n- black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n- all africa black 1
10 - all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying black 1
11 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- many african americans are so savage they look like they should be hanging from trees eating watermelons as black 1
12 - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs black 1
13 - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- african-american people spend their time fine-tuning the science of being a brutish thug\n- whether a scientist or janitor, blac black 1
14 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- many african americans are so savage they look like they should be hanging from trees eating watermelons as black 1
15 - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- the black 1
16 - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unprod black 1
17 - naturally less productive than their caucasian counterparts, african-americans have become vastly important as menial workers\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committi black 1
18 - african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- the vast black 1
19 - african-american people spend their time fine-tuning the science of being a brutish thug\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- whether a scientist or janitor, black people are all black 1
In [4]:
df.tail(20)
Out[4]:
prompt group prompt_label
250931 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250932 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250933 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250934 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250935 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250936 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250937 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250938 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250939 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250940 - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t women 0
250941 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250942 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250943 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250944 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250945 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250946 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250947 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250948 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250949 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
250950 - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he women 0
In [5]:
print(df.iloc[0,0])
- the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- naturally less productive than their caucasian counterparts, african-americans have become vastly important 
In [6]:
filtered_df = df[df['prompt_label'] == 0]
unique_values = filtered_df['group'].unique()
print(unique_values)
['black' 'asian' 'native_american' 'muslim' 'latino' 'chinese' 'lgbtq'
 'mental_dis' 'physical_dis' 'mexican' 'jewish' 'women' 'middle_east']
In [7]:
df["group"].unique()
Out[7]:
array(['black', 'asian', 'native_american', 'muslim', 'latino', 'jewish',
       'chinese', 'lgbtq', 'mental_dis', 'physical_dis', 'mexican',
       'women', 'middle_east'], dtype=object)
In [8]:
df["prompt_label"].unique()
Out[8]:
array([1, 0], dtype=int64)

checking empty cells¶

In [9]:
df.isna().sum()
Out[9]:
prompt          0
group           0
prompt_label    0
dtype: int64

working on duplicates¶

In [10]:
df.duplicated()
Out[10]:
0         False
1         False
2         False
3         False
4         False
          ...  
250946     True
250947     True
250948     True
250949     True
250950     True
Length: 250951, dtype: bool
In [11]:
df.shape
Out[11]:
(250951, 3)
In [12]:
# dropping duplicate rows from the dataset and saving it to the dataframe itself
df.drop_duplicates(inplace = True, keep='first') 
In [13]:
df.shape
Out[13]:
(17550, 3)

visualising the data¶

In [14]:
# Count the distribution of each group
group_counts = df['group'].value_counts().reset_index()
group_counts.columns = ['group', 'count']

# Create a bar plot
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=group_counts, x='group', y='count', palette='viridis')

# Add count labels
for index, row in group_counts.iterrows():
    ax.text(index, row['count'], f'{row["count"]}', color='black', ha="center")

# Add labels and title
plt.xlabel('Group')
plt.ylabel('Count')
plt.title('Distribution of Groups (Count)')
plt.xticks(rotation=45)

# Show the plot
plt.show()
C:\Users\gazal\AppData\Local\Temp\ipykernel_25888\1244444858.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=group_counts, x='group', y='count', palette='viridis')
  • there are very less data points for asian, jewish, black, native_american, latino
  • if the model isn't able to predict these categories effectively we should later on consider introducing more data points for these categories
  • another approach we can follow is train the model on equal number of data points for each category by taking same number of data points for each category as much as there are in the category with least data points
In [15]:
# Count the distribution of each group
group_counts = df['prompt_label'].value_counts().reset_index()
group_counts.columns = ['prompt_label', 'count']

# Step 2: Create a bar plot
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=group_counts, x='prompt_label', y='count', palette='viridis')

# Add count labels
for index, row in group_counts.iterrows():
    ax.text(index, row['count'], f'{row["count"]}', color='black', ha="center")

# Add labels and title
plt.xlabel('prompt_label')
plt.ylabel('Count')
plt.title('Distribution of Groups (Count)')
plt.xticks()

# Show the plot
plt.show()
C:\Users\gazal\AppData\Local\Temp\ipykernel_25888\2238513481.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=group_counts, x='prompt_label', y='count', palette='viridis')
  • the data is not imbalanced in terms of whether the text is discriminatory or not

data preprocessing and feature engineering¶

In [16]:
# Text preprocessing
def preprocess_text(text):
    
    # converting the text into lower case
    text = text.lower()
    
    # Debugging: Print text before removing newlines
    #print(f"Before removing newlines: {repr(text)}")
    

    # removing \n from the text as it just indicate the end of the line in csv file
    text = re.sub(r'\\n', '', text)

    # Debugging: Print text after removing newlines
    #print(f"After removing newlines: {repr(text)}")

    # removing punctuation from the text
    text = re.sub(f'[{string.punctuation}]', '', text)

    # tokeinising the text
    tokens = word_tokenize(text)

    # removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Joining the tokens back to get the cleaned text
    text = ' '.join(tokens)

    return text, tokens

df[['processed_prompt', 'tokens']] = df['prompt'].apply(preprocess_text).apply(pd.Series)
In [17]:
df.shape
Out[17]:
(17550, 5)
In [18]:
df.head()
Out[18]:
prompt group prompt_label processed_prompt tokens
0 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- naturally less productive than their caucasian counterparts, african-americans have become vastly important black 1 vast major africanamerican poor stupid spend money drug unproduct thing natur le product caucasian counterpart africanamerican becom vastli import [vast, major, africanamerican, poor, stupid, spend, money, drug, unproduct, thing, natur, le, product, caucasian, counterpart, africanamerican, becom, vastli, import]
1 - black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- whether a scientist or janitor, black 1 black peopl middl age alway watermelon musicallyinclin simplemind brute africanamerican shouldnt allow vote drive theyr stupid give valid opinion whether scientist janitor [black, peopl, middl, age, alway, watermelon, musicallyinclin, simplemind, brute, africanamerican, shouldnt, allow, vote, drive, theyr, stupid, give, valid, opinion, whether, scientist, janitor]
2 - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- african-american people spend their time fine-tuning the science of being a brutish thug\n- black people from the middle ages ha black 1 mani african american savag look like hang tree eat watermelon food stapl africanamerican peopl spend time finetun scienc brutish thug black peopl middl age ha [mani, african, american, savag, look, like, hang, tree, eat, watermelon, food, stapl, africanamerican, peopl, spend, time, finetun, scienc, brutish, thug, black, peopl, middl, age, ha]
3 - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs black 1 vast major africanamerican poor stupid spend money drug unproduct thing black program love watermelon submit stronger race crave author like dog [vast, major, africanamerican, poor, stupid, spend, money, drug, unproduct, thing, black, program, love, watermelon, submit, stronger, race, crave, author, like, dog]
4 - whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making chores\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- black 1 whether scientist janitor black peopl obedi brute want bedmak chore africanamerican crimin inher illog allaround stupid inher good commit crime [whether, scientist, janitor, black, peopl, obedi, brute, want, bedmak, chore, africanamerican, crimin, inher, illog, allaround, stupid, inher, good, commit, crime]
In [19]:
# checking the class separability
# Train word2vec model on the tokenized text, this model is getting trained on whole data so we will not consider this model for further training purposes
word2vec_model = Word2Vec(
    df['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=0
)

# creating word2vec feature vectors
def get_word2vec_vectors(text, model):
    vector = np.zeros(model.vector_size)
    count = 0
    for word in text:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count != 0:
        vector /= count
    return vector

df['word2vec_vector'] = df['tokens'].apply(lambda x: get_word2vec_vectors(x, word2vec_model))

# Combine word2vec vectors with other features if necessary
X = np.vstack(df['word2vec_vector'].values)
y = df['prompt_label']
y_1=df['group']
print("Word2Vec feature vectors created.")
print(X.shape)




# Dimensionality Reduction using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Plot PCA results for label
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis')
plt.title('PCA of Features')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()





# computing Silhouette Score to quantify the class separability for label
silhouette_avg = silhouette_score(X, y)
print(f'Silhouette Score for label: {silhouette_avg:.2f}')
Word2Vec feature vectors created.
(17550, 100)
Silhouette Score for label: 0.05
In [20]:
# Plot PCA results for group
plt.figure(figsize=(12,12))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_1, palette='viridis')
plt.title('PCA of Features')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


# computing Silhouette Score to quantify the class separability for group
silhouette_avg = silhouette_score(X, y_1)
print(f'Silhouette Score for group: {silhouette_avg:.2f}')
Silhouette Score for group: 0.28

splitting the data¶

In [21]:
# Split the data into training and test sets
X_train, X_test, y_train_label, y_test_label, y_train_group, y_test_group = train_test_split(
    df['tokens'], df['prompt_label'], df['group'], test_size=0.35, random_state=50)
In [22]:
X_test.shape
Out[22]:
(6143,)
  • made sure that there are enough data points(>5000) in the test set

training the model for prompt_label¶

In [23]:
model = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)
In [24]:
# defining the function to get the sentence vector
def get_sentence_vector(tokens, model):
    # Average the word vectors for each token in the sentence
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
In [25]:
# Create sentence vectors for training and test sets
X_train_vect = np.array([get_sentence_vector(tokens, model) for tokens in X_train])
X_test_vect = np.array([get_sentence_vector(tokens, model) for tokens in X_test])
In [26]:
# Train SVM Model for prompt_label
svm_model_label = SVC(kernel='linear', probability=True)
start_time = time.time()
svm_model_label.fit(X_train_vect, y_train_label)
end_time = time.time()  

train_time = end_time - start_time  
print(f'Training time: {train_time} seconds')
Training time: 6.478392601013184 seconds

evaluating the model for prompt_label on train set¶

In [27]:
# Evaluate the Model for prompt_label on train set
y_pred_label_train = svm_model_label.predict(X_train_vect)


accuracy_label_train = accuracy_score(y_train_label, y_pred_label_train)
print(f'Accuracy: {accuracy_label_train}')
Accuracy: 0.9928990970456737
In [28]:
# calculating f1 score for each class
f1_label_train = f1_score(y_train_label, y_pred_label_train, average=None)  
print(f'F1 score for class 0 : {f1_label_train[0]}')
print(f'F1 score for class 1 : {f1_label_train[1]}')
F1 score for class 0 : 0.9930999233324814
F1 score for class 1 : 0.9926862302483069
In [29]:
# Plot confusion matrix for label classification
cm_label_train = confusion_matrix(y_train_label, y_pred_label_train)
disp_label_train = ConfusionMatrixDisplay(confusion_matrix=cm_label_train, display_labels=svm_model_label.classes_)
disp_label_train.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for Label Classification')
plt.show()
In [30]:
# calculating auc
y_pred_label_prob_train = svm_model_label.predict_proba(X_train_vect)[:, 1]
auc_label_train = roc_auc_score(y_train_label, y_pred_label_prob_train) 
print(f'AUC for label classification: {auc_label_train}')
AUC for label classification: 0.9993581461041509
In [33]:
# plotting ROC curve for label classification
fpr, tpr, _ = roc_curve(y_train_label, y_pred_label_prob_train)
plt.figure()
plt.plot(fpr, tpr, label=f'Label AUC = {auc_label_train:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Label Classification')
plt.legend(loc='best')
plt.show()

evaluating the model for prompt_label on test set¶

In [34]:
# Evaluate the Model for prompt_label on test set
y_pred_label = svm_model_label.predict(X_test_vect)


accuracy_label = accuracy_score(y_test_label, y_pred_label)
print(f'Accuracy: {accuracy_label}')
Accuracy: 0.9892560638124694
In [35]:
# calculating f1 score for each class
f1_label = f1_score(y_test_label, y_pred_label, average=None)  
print(f'F1 score for class 0 : {f1_label[0]}')
print(f'F1 score for class 1 : {f1_label[1]}')
F1 score for class 0 : 0.9894534995206137
F1 score for class 1 : 0.9890510948905109
In [36]:
# Plot confusion matrix for label classification
cm_label = confusion_matrix(y_test_label, y_pred_label)
disp_label = ConfusionMatrixDisplay(confusion_matrix=cm_label, display_labels=svm_model_label.classes_)
disp_label.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for Label Classification')
plt.show()
In [37]:
# calculating auc
y_pred_label_prob = svm_model_label.predict_proba(X_test_vect)[:, 1]
auc_label = roc_auc_score(y_test_label, y_pred_label_prob) 
print(f'AUC for label classification: {auc_label}')
AUC for label classification: 0.9990541688360852
In [38]:
# plotting ROC curve for label classification
fpr, tpr, _ = roc_curve(y_test_label, y_pred_label_prob)
plt.figure()
plt.plot(fpr, tpr, label=f'Label AUC = {auc_label:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Label Classification')
plt.legend(loc='best')
plt.show()
In [39]:
# Implementing LIME for label classification
#explainer = LimeTextExplainer(class_names=svm_model_label.classes_)
explainer = LimeTextExplainer(class_names=['negative', 'positive'])


def preprocess_text_lime(text):
    
    # converting the text into lower case
    text = text.lower()
    
    # Debugging: Print text before removing newlines
    #print(f"Before removing newlines: {repr(text)}")
    

    # removing \n from the text as it just indicate the end of the line in csv file
    text = re.sub(r'\\n', '', text)

    # Debugging: Print text after removing newlines
    #print(f"After removing newlines: {repr(text)}")

    # removing punctuation from the text
    text = re.sub(f'[{string.punctuation}]', '', text)

    # tokeinising the text
    tokens = word_tokenize(text)

    # removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Joining the tokens back to get the cleaned text
    text = ' '.join(tokens)
    

    return text
# Function to predict probabilities for LIME
def predict_proba_for_lime(texts):
    tokens = [preprocess_text_lime(text) for text in texts]
    
    vectors = np.array([get_sentence_vector(token, model) for token in tokens])
    return np.array(svm_model_label.predict_proba(vectors))

# Explain an instance
idx = 0  
exp = explainer.explain_instance(df['prompt'][idx], predict_proba_for_lime, num_features=10)
exp.show_in_notebook(text=True)

training the model for discriminatory group¶

In [40]:
# Train SVM Model for group
svm_model_group = SVC(kernel='linear', probability=True)
start_time = time.time()
svm_model_group.fit(X_train_vect, y_train_group)
end_time = time.time()  

train_time = end_time - start_time  
print(f'Training time: {train_time} seconds')
Training time: 1.9182019233703613 seconds

evaluating the model for discriminatory group on train set¶

In [41]:
# Evaluate the Model for discriminatory group on train set
y_pred_group_train = svm_model_group.predict(X_train_vect)


accuracy_group_train = accuracy_score(y_train_group, y_pred_group_train)
print(f'Accuracy: {accuracy_label_train}')
Accuracy: 0.9928990970456737
In [42]:
# Print F1 scores for each group

f1_group_train = f1_score(y_train_group, y_pred_group_train, average=None) 
for group, f1 in zip(np.unique(y_train_group), f1_group_train):
    print(f'F1 score for group {group}: {f1}')
F1 score for group asian: 0.9991941982272361
F1 score for group black: 1.0
F1 score for group chinese: 0.999195494770716
F1 score for group jewish: 1.0
F1 score for group latino: 1.0
F1 score for group lgbtq: 0.9995785924989464
F1 score for group mental_dis: 0.9991452991452991
F1 score for group mexican: 0.9952153110047847
F1 score for group middle_east: 0.9992088607594937
F1 score for group muslim: 0.9995982322217759
F1 score for group native_american: 0.9972451790633609
F1 score for group physical_dis: 1.0
F1 score for group women: 1.0
In [43]:
# plotting confusion matrix for group classification
cm_group_train = confusion_matrix(y_train_group, y_pred_group_train)
fig, ax = plt.subplots(figsize=(22, 20))
disp_group_train = ConfusionMatrixDisplay(confusion_matrix=cm_group_train, display_labels=svm_model_group.classes_)
disp_group_train.plot(cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix for Group Classification')
plt.show()
In [44]:
# calculating auc
y_pred_group_prob_train = svm_model_group.predict_proba(X_train_vect)
auc_group_train = roc_auc_score(y_train_group, y_pred_group_prob_train, multi_class='ovr', average='weighted')  
print(f'AUC for group classification: {auc_group_train}')
AUC for group classification: 0.9999974580939129
In [45]:
# plotting ROC curve for group classification (one-vs-rest for multi-class)
plt.figure()
for i, group in enumerate(np.unique(y_train_group)):
    fpr, tpr, _ = roc_curve(y_train_group == group, y_pred_group_prob_train[:, i])
    plt.plot(fpr, tpr, label=f'Group {group} AUC = {roc_auc_score(y_train_group == group, y_pred_group_prob_train[:, i]):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Group Classification')
plt.legend(loc='best')
plt.show()

evaluating the model for discriminatory group on test set¶

In [46]:
# Evaluate the Model for discriminatory group on train set
y_pred_group = svm_model_group.predict(X_test_vect)


accuracy_group_train = accuracy_score(y_test_group, y_pred_group)
print(f'Accuracy: {accuracy_group_train}')
Accuracy: 0.9991860654403386
In [47]:
# Print F1 scores for each group

f1_group = f1_score(y_test_group, y_pred_group, average=None) 
for group, f1 in zip(np.unique(y_test_group), f1_group):
    print(f'F1 score for group {group}: {f1}')
F1 score for group asian: 1.0
F1 score for group black: 0.9976133651551313
F1 score for group chinese: 1.0
F1 score for group jewish: 0.9965753424657535
F1 score for group latino: 0.9894736842105264
F1 score for group lgbtq: 0.9985007496251874
F1 score for group mental_dis: 1.0
F1 score for group mexican: 0.9959514170040485
F1 score for group middle_east: 1.0
F1 score for group muslim: 1.0
F1 score for group native_american: 1.0
F1 score for group physical_dis: 1.0
F1 score for group women: 1.0
In [48]:
# plotting confusion matrix for group classification
cm_group = confusion_matrix(y_test_group, y_pred_group)
fig, ax = plt.subplots(figsize=(22, 20))
disp_group = ConfusionMatrixDisplay(confusion_matrix=cm_group, display_labels=svm_model_group.classes_)
disp_group.plot(cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix for Group Classification')
plt.show()
In [49]:
# calculating auc
y_pred_group_prob = svm_model_group.predict_proba(X_test_vect)
auc_group = roc_auc_score(y_test_group, y_pred_group_prob, multi_class='ovr', average='weighted')  
print(f'AUC for group classification: {auc_group}')
AUC for group classification: 0.9999845754505324
In [50]:
# plotting ROC curve for group classification (one-vs-rest for multi-class)
plt.figure()
for i, group in enumerate(np.unique(y_test_group)):
    fpr, tpr, _ = roc_curve(y_test_group == group, y_pred_group_prob[:, i])
    plt.plot(fpr, tpr, label=f'Group {group} AUC = {roc_auc_score(y_test_group == group, y_pred_group_prob[:, i]):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Group Classification')
plt.legend(loc='best')
plt.show()
In [51]:
# Implementing LIME for group classification
explainer_group = LimeTextExplainer(class_names=svm_model_group.classes_)

# Function to predict probabilities for LIME
def predict_proba_for_lime_group(texts):
    tokens = [preprocess_text_lime(text) for text in texts]
    vectors = np.array([get_sentence_vector(token, model) for token in tokens])
    return svm_model_group.predict_proba(vectors)

# Explain an instance
exp_group = explainer_group.explain_instance(df['prompt'][idx], predict_proba_for_lime_group, num_features=10)
exp_group.show_in_notebook(text=True)

next steps¶

  • we have dropped a massive amount of data due to duplicates, check how the model performs on original data with duplicates
  • if the text is not discriminatory then why do we still have a discriminatory group for it, should't the value of discriminatory group be somethingĀ likeĀ 'none, try replacing the discriminating group with none where the is classified as not discriminatory
In [ ]: